{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tarfile\n",
    "import urllib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "默认路径 = os.path.join(\"datasets\", \"housing\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def 载入房屋数据(存储路径 = 默认数据存储路径):\n",
    "    csv_路径 = os.path.join(存储路径, \"housing.csv\")\n",
    "    return pd.read_csv(csv_路径)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据 = 载入房屋数据()\n",
    "房屋数据.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据[\"ocean_proximity\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as 绘图\n",
    "房屋数据.hist(bins=50, figsize=(20, 15))\n",
    "绘图.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def 分割训练集与测试集(数据, 测试集占比):\n",
    "    随机序号 = np.random.permutation(len(数据))\n",
    "    测试集大小 = int(len(数据) * 测试集占比)\n",
    "    测试集序号 = 随机序号[:测试集大小]\n",
    "    训练集序号 = 随机序号[测试集大小:]\n",
    "    return 数据.iloc[训练集序号], 数据.iloc[测试集序号]\n",
    "    #pandas数据选取方法iloc(行序号/数组, 列序号/数组)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "训练集, 测试集 = 分割训练集与测试集(房屋数据, 0.2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#更新数据后, 保持原测试集不变\n",
    "from zlib import crc32\n",
    "\n",
    "def 测试集检测(标识符, 测试集占比):\n",
    "    return crc32(np.int64(标识符)) & 0xffffffff < 测试集占比 * 2**32\n",
    "\n",
    "def 通过id分割训练集与测试集(数据, 测试集占比, id所在列):\n",
    "    # id == 标识符\n",
    "    标识符集 = 数据[id所在列]\n",
    "    属于测试集 = 标识符集.apply(lambda 标识符: 测试集检测(标识符, 测试集占比))\n",
    "    return 数据.loc[~属于测试集], 数据.loc[属于测试集]\n",
    "    # ~ 按位取反\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据_带序号 = 房屋数据.reset_index() #添加'index'列\n",
    "训练集, 测试集 = 通过id分割训练集与测试集(房屋数据_带序号, 0.2,\n",
    "                        \"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#若数据可能被删除, 序号易改变, 则可考虑使用经纬度构造id\n",
    "房屋数据_带序号[\"id\"] = 房屋数据[\"longitude\"] * 1000 + 房屋数据[\"latitude\"]\n",
    "训练集, 测试集 = 通过id分割训练集与测试集(房屋数据_带序号, 0.2,\n",
    "                        \"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#测试分割结果\n",
    "#print(str(len(训练集)) + '+' + str(len(测试集)) + '=' + str(len(房屋数据_带序号)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split as 分割训练与测试集\n",
    "训练集, 测试集 = 分割训练与测试集(房屋数据, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据[\"收入分类\"] = pd.cut(房屋数据[\"median_income\"],\n",
    "                            bins=[0.0,1.5,3.0,4.5,6.,np.inf],\n",
    "                             labels=[1,2,3,4,5])\n",
    "房屋数据[\"收入分类\"].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedShuffleSplit as 分层随机分割\n",
    "\n",
    "分割 = 分层随机分割(n_splits=1, test_size=0.2, random_state=42)\n",
    "for 训练集序号, 测试集序号 in 分割.split(房屋数据, 房屋数据[\"收入分类\"]):\n",
    "    分层训练集 = 房屋数据.loc[训练集序号]\n",
    "    分层测试集 = 房屋数据.loc[测试集序号]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "分层测试集[\"收入分类\"].value_counts() / len(分层测试集)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "分层训练集[\"收入分类\"].value_counts() / len(分层训练集)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for 数据集 in (分层训练集, 分层测试集):\n",
    "    数据集.drop(\"收入分类\", axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据 = 分层训练集.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据.plot(kind=\"scatter\", \n",
    "          x=\"longitude\", y=\"latitude\", alpha=0.4,\n",
    "          s=房屋数据['population']/100,\n",
    "          label='人口', figsize=(10, 7),\n",
    "          c='median_house_value', cmap=plt.get_cmap('jet'),\n",
    "          colorbar=True)\n",
    "plt.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "相关系数矩阵 = 房屋数据.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "相关系数矩阵['median_house_value'].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.plotting import scatter_matrix as 散点图矩阵\n",
    "属性 = ['median_house_value', 'median_income', 'total_rooms',\n",
    "        'housing_median_age']\n",
    "散点图矩阵(房屋数据[属性], figsize=(12, 8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "房屋数据.plot(kind='scatter', x='median_income',\n",
    "             y='median_house_value', alpha=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据['每户房间数'] = 房屋数据['total_rooms'] / 房屋数据['households']\n",
    "\n",
    "房屋数据['每房间卧室数'] = 房屋数据['total_bedrooms'] / 房屋数据['total_rooms']\n",
    "\n",
    "房屋数据['每户人口'] = 房屋数据['population'] / 房屋数据['households']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#'每户房间数','每房间卧室数','每户人口'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#房屋数据.drop(['每户房间数', '每房间卧室数', '每户人口'], axis='columns')\n",
    "相关系数矩阵 = 房屋数据.corr()\n",
    "相关系数矩阵['median_house_value'].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据 = 分层训练集.drop(\"median_house_value\", axis=1)\n",
    "房屋数据_标记 = 分层训练集['median_house_value'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "如何处理含空值数据?\n",
    "选项一 去掉含空值的街区\n",
    "房屋数据.dropna(subset=['total_bedrooms']) \n",
    "选项二 去掉含空值的整个属性\n",
    "房屋数据.drop('total_bedrooms', axis=1)\n",
    "选项三 给空值指定数值\n",
    "中位数 = 住房数据['total_bedrooms'].median()\n",
    "住房数据['total_bedrooms'].fillna(中位数, implace=True)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "平均值填空器 = SimpleImputer(strategy='median')\n",
    "房屋数据_仅数值 = 房屋数据.drop('ocean_proximity', axis=1)\n",
    "平均值填空器.fit(房屋数据_仅数值)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "平均值填空器.statistics_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据_仅数值.median().values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = 平均值填空器.transform(房屋数据_仅数值)\n",
    "房屋数据_转换后 = pd.DataFrame(X, columns=房屋数据_仅数值.columns,\n",
    "                             index=房屋数据_仅数值.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据_类别 = 房屋数据[['ocean_proximity']]\n",
    "房屋数据_类别.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "转数值编码器 = OrdinalEncoder()\n",
    "房屋数据_类别_已编码 = 转数值编码器.fit_transform(房屋数据_类别)\n",
    "房屋数据_类别_已编码[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "转数值编码器.categories_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "类别编码器 = OneHotEncoder()\n",
    "房屋数据_类别_独热 = 类别编码器.fit_transform(房屋数据_类别)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据_类别_独热"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据_类别_独热.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6\n",
    "\n",
    "class 属性组合器(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, add_bedrooms_per_room = True):\n",
    "        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
    "    \n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    "    \n",
    "    def transform(self, X, y=None):\n",
    "        rooms_per_household = X[:, population_ix] / X[:, households_ix]\n",
    "        population_per_household = X[:, population_ix] / X[:, households_ix]\n",
    "        if self.add_bedrooms_per_room:\n",
    "            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n",
    "            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n",
    "        else:\n",
    "            return np.c_[X, rooms_per_household, population_per_household]\n",
    "    \n",
    "   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "attr_adder = 属性组合器(add_bedrooms_per_room=False)\n",
    "房屋数据_含额外属性 = attr_adder.transform(房屋数据.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline as 流水线\n",
    "from sklearn.preprocessing import StandardScaler as 规范器\n",
    "#流水线(Pipeline) 按顺序依次用函数处理数据\n",
    "数值流水线 = 流水线([\n",
    "    ('平均值填空', SimpleImputer(strategy='median')),\n",
    "    ('属性增加', 属性组合器()),\n",
    "    ('规范化', 规范器()),\n",
    "])\n",
    "房屋数据_仅数值_转换后 = 数值流水线.fit_transform(房屋数据_仅数值)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer as 列转换器\n",
    "\n",
    "数值属性 = list(房屋数据_仅数值)\n",
    "类别属性 = ['ocean_proximity']\n",
    "独热编码器 = OneHotEncoder\n",
    "\n",
    "完整流水线 = 列转换器([\n",
    "    ('数值', 数值流水线, 数值属性),\n",
    "    ('类别', 独热编码器(), 类别属性),\n",
    "])\n",
    "\n",
    "房屋数据_预处理后 = 完整流水线.fit_transform(房屋数据)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#用线性模型训练数据\n",
    "from sklearn.linear_model import LinearRegression\n",
    "线性回归 = LinearRegression()\n",
    "线性回归.fit(房屋数据_预处理后, 房屋数据_标记)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#测试线性回归模型\n",
    "一些数据 = 房屋数据.iloc[:5]\n",
    "一些标记 = 房屋数据_标记.iloc[:5]\n",
    "一些数据_预处理后 = 完整流水线.transform(一些数据)\n",
    "线性回归预测值 = 线性回归.predict(一些数据_预处理后)\n",
    "原标记值 = list(一些标记)\n",
    "print(\"预测值: \", [int(x) for x in 线性回归预测值])\n",
    "print(\"标记值: \", [int(x) for x in 原标记值])\n",
    "print(\"误差率: \", [' ' + str(int((原标记值[i] - 线性回归预测值[i]) * 100 / 原标记值[i])) + '%' for i in range(len(原标记值))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#计算线性模型的标准差\n",
    "from sklearn.metrics import mean_squared_error as 均方误差\n",
    "房屋数据_预测值 = 线性回归.predict(房屋数据_预处理后)\n",
    "线性_均方误差 = 均方误差(房屋数据_标记, 房屋数据_预测值)\n",
    "线性_均方根误差 = np.sqrt(线性_均方误差)\n",
    "print(线性_均方根误差)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#用决策树回归器模型训练\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "决策树回归器 = DecisionTreeRegressor()\n",
    "决策树回归器.fit(房屋数据_预处理后, 房屋数据_标记)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#计算决策树模型的均方根误差\n",
    "房屋数据_预测值 = 决策树回归器.predict(房屋数据_预处理后)\n",
    "决策树_均方误差 = 均方误差(房屋数据_标记, 房屋数据_预测值)\n",
    "决策树_均方根误差 = np.sqrt(决策树_均方误差)\n",
    "print(决策树_均方根误差)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#10折交叉验证训练决策树\n",
    "from sklearn.model_selection import cross_val_score as 交叉验证打分\n",
    "\n",
    "负均方误差 = 'neg_mean_squared_error'\n",
    "\n",
    "得分 = 交叉验证打分(决策树回归器, 房屋数据_预处理后, 房屋数据_标记,\n",
    "                   scoring=负均方误差, cv=10)\n",
    "决策树_均方根_得分 = np.sqrt(-得分) \n",
    "#打分用的是功效函数,即负均方根, 得分一般用损失函数,故取负"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def 显示分数(得分):\n",
    "    print(\"得分:\", 得分)\n",
    "    print('平均值:', 得分.mean())\n",
    "    print('标准差:', 得分.std())\n",
    "显示分数(决策树_均方根_得分)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "线性回归_得分 = 交叉验证打分(线性回归, 房屋数据_预处理后, 房屋数据_标记,\n",
    "                        scoring=负均方误差, cv=10)\n",
    "线性回归_均方根_得分 = np.sqrt(-线性回归_得分)\n",
    "显示分数(线性回归_均方根_得分)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#用随机森林回归器训练\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "随机森林回归 = RandomForestRegressor()\n",
    "随机森林回归.fit(房屋数据_预处理后, 房屋数据_标记)\n",
    "\n",
    "\n",
    "随机森林_得分 = 交叉验证打分(随机森林回归, 房屋数据_预处理后, 房屋数据_标记,\n",
    "                        scoring=负均方误差, cv=10)\n",
    "随机森林_均方根_得分 = np.sqrt(-随机森林_得分)\n",
    "显示分数(随机森林_均方根_得分)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "房屋数据_预测值 = 随机森林回归.predict(房屋数据_预处理后)\n",
    "随机森林_均方误差 = 均方误差(房屋数据_标记, 房屋数据_预测值)\n",
    "随机森林_均方根误差 = np.sqrt(随机森林_均方误差)\n",
    "print(随机森林_均方根误差)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存模型\n",
    "import joblib\n",
    "joblib.dump(随机森林回归, '随机森林回归.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#精校参数 Grid Search 翻译为网格穷举比较好\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "网格参数 = [\n",
    "    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},\n",
    "    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
    "]\n",
    "随机森林回归 = RandomForestRegressor()\n",
    "网格穷举 = GridSearchCV(随机森林回归, 网格参数, cv=5,\n",
    "                   scoring=负均方误差,\n",
    "                   return_train_score=True)\n",
    "网格穷举.fit(房屋数据_预处理后, 房屋数据_标记)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(网格穷举.best_params_)\n",
    "#与书本不一样的原因: 添加重复属性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(list(房屋数据_预处理后))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(网格穷举.best_estimator_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "验证结果 = 网格穷举.cv_results_\n",
    "for 平均分, 参数 in zip(验证结果['mean_test_score'],\n",
    "                  验证结果['params']):\n",
    "    print(np.sqrt(-平均分), 参数)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "特征重要性 = 网格穷举.best_estimator_.feature_importances_\n",
    "print(特征重要性)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#输出每属性的重要性\n",
    "额外属性 = ['每户房间数','每户人口','每房间卧室数']\n",
    "类别编码器 = 完整流水线.named_transformers_['类别']\n",
    "类别_独热属性 = list(类别编码器.categories_[0])\n",
    "属性 = 数值属性 + 额外属性 + 类别_独热属性\n",
    "for x in sorted(zip(特征重要性, 属性), reverse=True):\n",
    "    print('{}\\n'.format(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "最终模型 = 网格穷举.best_estimator_\n",
    "\n",
    "测试集特征 = 分层测试集.drop(\"median_house_value\", axis='columns')\n",
    "测试集标记 = 分层测试集[\"median_house_value\"].copy()\n",
    "\n",
    "测试集特征_预处理后 = 完整流水线.transform(测试集特征)\n",
    "\n",
    "最终预测值 = 最终模型.predict(测试集特征_预处理后)\n",
    "\n",
    "最终均方误差 = 均方误差(测试集标记, 最终预测值)\n",
    "最终均方根误差 = np.sqrt(最终均方误差)\n",
    "print(最终均方根误差)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#求置信区间\n",
    "from scipy import stats as 统计\n",
    "置信度 = 0.95\n",
    "方差 = (最终预测值 - 测试集标记) ** 2\n",
    "置信区间 = np.sqrt(统计.t.interval(置信度, len(方差) - 1,\n",
    "                            loc=方差.mean(),\n",
    "                            scale=统计.sem(方差)))\n",
    "print(置信区间)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
